1
library(datasets) # Load iris data
library(ggplot2)
library(nnet) # For multinomial logistic regression
library(naivebayes)
## naivebayes 0.9.7 loaded
library(adabag) # For AdaBoost
## Loading required package: rpart
## Loading required package: caret
## Loading required package: lattice
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(rpart) # For AdaBoost
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.5 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x purrr::accumulate() masks foreach::accumulate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## x purrr::when() masks foreach::when()
library(ggpubr)
library(caret)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
1a Boxplots, Correlation Matrix, Scatter Plots of Features Pairs:
# --- 1a: boxplots, correlation matrix, pairwise scatter plots ---
data <- iris

# One boxplot per numeric feature; gsub() turns "Sepal.Length" into the
# title "Sepal Length", matching the hand-written titles.
for (feature in c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")) {
  boxplot(data[[feature]], main = gsub(".", " ", feature, fixed = TRUE))
}
#boxplot(data$Species)

# Correlation matrix of the four numeric features
cor(data[1:4])
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1175698    0.8717538   0.8179411
## Sepal.Width    -0.1175698   1.0000000   -0.4284401  -0.3661259
## Petal.Length    0.8717538  -0.4284401    1.0000000   0.9628654
## Petal.Width     0.8179411  -0.3661259    0.9628654   1.0000000

# Scatter plot of one feature pair, colored by species. The {{ }} (rlang
# curly-curly, available via ggplot2) forwards the bare column names so the
# axis labels match the original per-pair calls.
plot_pair <- function(xvar, yvar) {
  ggplot(iris, aes(x = {{ xvar }}, y = {{ yvar }}, color = Species)) +
    geom_point()
}
plot_pair(Sepal.Length, Sepal.Width)
plot_pair(Sepal.Length, Petal.Length)
plot_pair(Sepal.Length, Petal.Width)
plot_pair(Sepal.Width, Petal.Length)
plot_pair(Sepal.Width, Petal.Width)
plot_pair(Petal.Length, Petal.Width)
1b:
Use X1, X2, X3 to fit the multinomial logistic regression, the Naive Bayes classifier, and the AdaBoost with 30 Decision Tree classifiers with maximum depth 3. Plot the decision surfaces of the 3 classifiers on (X1, X2) and (X1, X3) respectively.
# --- 1b: multinomial logistic regression, naive Bayes, and AdaBoost on
# X1 = Sepal.Length, X2 = Sepal.Width, X3 = Petal.Length ---

# Fit a multinomial logistic regression
multinom_model <- multinom(Species ~ Sepal.Length + Sepal.Width + Petal.Length, data = iris)

# Grid points spanning the range of each predictor, used to color the
# decision surfaces. X1 varies in blocks of 500; X2 and X3 cycle together
# within each block (reproducing the original grid layout of 500*500 rows).
r <- sapply(iris[, c(1, 2, 3)], range, na.rm = TRUE)
x1_seq <- seq(r[1, 1], r[2, 1], length.out = 500)
x2_seq <- seq(r[1, 2], r[2, 2], length.out = 500)
x3_seq <- seq(r[1, 3], r[2, 3], length.out = 500)
# `times`, spelled out -- the original relied on partial matching of `time =`
grid <- cbind(rep(x1_seq, each = 500),
              rep(x2_seq, times = 500),
              rep(x3_seq, times = 500))
colnames(grid) <- colnames(r)
grid <- as.data.frame(grid)

# Predictions from each classifier on the grid points
multinom_pred <- predict(multinom_model, newdata = grid, type = "class")

# Naive Bayes
nb_model <- naive_bayes(Species ~ Sepal.Length + Sepal.Width + Petal.Length, data = iris)
nb_pred <- predict(nb_model, grid, type = "class")

# AdaBoost with 30 decision trees of maximum depth 3. Fitted BEFORE the
# plots below: the original built boost_ds1 first and only worked because
# aes() evaluates lazily, at render time.
boost_model <- boosting(Species ~ Sepal.Length + Sepal.Width + Petal.Length,
                        data = iris, mfinal = 30, control = rpart.control(maxdepth = 3))
boost_pred <- predict(boost_model, grid, type = "class")$class

# Decision surfaces on (X1, X2) with the observed points overlaid
mult_ds1 <- ggplot() +
  geom_point(data = iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species), size = 2, show.legend = FALSE) +
  geom_raster(alpha = 0.1, aes(x = grid[, 1], y = grid[, 2], fill = multinom_pred), show.legend = FALSE) +
  theme_bw()
nb_ds1 <- ggplot() +
  geom_point(data = iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species), size = 2, show.legend = FALSE) +
  geom_raster(alpha = 0.1, aes(x = grid[, 1], y = grid[, 2], fill = nb_pred), show.legend = FALSE) +
  theme_bw()
boost_ds1 <- ggplot() +
  geom_point(data = iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species), size = 2, show.legend = FALSE) +
  geom_raster(alpha = 0.1, aes(x = grid[, 1], y = grid[, 2], fill = boost_pred), show.legend = FALSE) +
  theme_bw()
ggarrange(mult_ds1, nb_ds1, boost_ds1, ncol = 2, nrow = 2)
1c:
Based on the decision surfaces, compare and summarize the advantage and disadvantage of each classifier.
2a:
Creating correlated errors through time. Please write a function that takes in an integer, n ≥1, then produces “errors” that are generated in the following way:
generate_errors <- function(n) {
  # Generate n "errors" forming a Gaussian random walk:
  #   e_1 ~ N(0, 1);  e_i ~ N(e_{i-1}, 1) for i > 1.
  # This is exactly the cumulative sum of n iid N(0, 1) draws, which avoids
  # the original's O(n^2) vector growth via c() in a loop and its
  # dplyr::last() dependency. rnorm(1, mean = m, sd = 1) consumes the same
  # underlying draw as m + rnorm(1), so for a given seed the output is
  # identical to the original loop.
  #
  # n: a single integer >= 1 (number of errors to generate).
  # Returns: numeric vector of length n.
  stopifnot(length(n) == 1, n >= 1)
  cumsum(rnorm(n, mean = 0, sd = 1))
}
2b
# --- 2b: five runs of correlated errors, stacked for plotting ---
# Build one long data frame: `cor` holds the error values, `trial` the run
# number (1..5), `index` the overall row number (x-axis of the plot).
# tibble() replaces the deprecated data_frame(), and the loop replaces the
# original five copy-pasted corr*/rep* blocks (same RNG call order, so the
# generated values are identical for a given seed).
final_df <- do.call(rbind, lapply(1:5, function(i) {
  tibble(cor = generate_errors(500), trial = i)
}))
final_df$index <- 1:2500
library(RColorBrewer)
# Correlated errors plot. The original used scale_fill_brewer(), which has
# no effect because the points are mapped to `color`, not `fill`; mapping
# trial as a factor lets the Dark2 palette actually apply.
cor_plot <- ggplot(final_df, aes(x = index, y = cor, color = factor(trial))) +
  geom_point() +
  scale_color_brewer(palette = "Dark2", name = "trial")
# Five runs of independent N(0, 1) errors, in the same long layout as
# final_df (cor = value, trial = run number, index = row number).
# tibble() replaces the deprecated data_frame(); the loop replaces the five
# copy-pasted uncorr*/unrep* blocks with the same RNG call order.
uncor_df <- do.call(rbind, lapply(1:5, function(i) {
  tibble(cor = rnorm(500, 0, 1), trial = i)
}))
uncor_df$index <- 1:2500
# Uncorrelated errors plot; factor(trial) + scale_color_brewer so the Dark2
# palette applies (the original scale_fill_brewer() had no effect on points
# mapped to `color`).
uncor_plot <- ggplot(uncor_df, aes(x = index, y = cor, color = factor(trial))) +
  geom_point() +
  scale_color_brewer(palette = "Dark2", name = "trial")
# Show graphs
cor_plot
uncor_plot
rnoise <- function(n) {
  # Correlated noise: a Gaussian random walk of length n
  # (eps_1 ~ N(0, 1); eps_i = eps_{i-1} + N(0, 1)).
  # The original looped over 2:n, which misbehaves for n = 1 because 2:1
  # counts DOWN (c(2, 1)); cumsum over n iid draws is equivalent, consumes
  # the same RNG stream, and handles n = 1 correctly.
  stopifnot(length(n) == 1, n >= 1)
  cumsum(rnorm(n, mean = 0, sd = 1))
}
num_simulation <- 5
# Each column is one simulated run of length 500
correlated_noise <- replicate(num_simulation, rnoise(n = 500))
independent_noise <- replicate(num_simulation, rnorm(n = 500))
# Side-by-side comparison of the two noise types
par(mfrow = c(1, 2))
matplot(correlated_noise, type = "l", lty = 1, pch = 19, ylab = "noise", xlab = "n", main = "Correlated")
matplot(independent_noise, type = "l", lty = 1, pch = 19, ylab = "noise", xlab = "n", main = "Independent")
2c
Comment on the difference. According to the plots you’ve created, please answer the following:
Do both types of errors have mean 0 overall (1 sentence)? What could you do to be more certain (1 sentence)?
The uncorrelated errors do have mean 0 overall, while the correlated errors definitely do not. To be more certain, we could compute the mean of the error column in each data frame, or average over many more simulated runs.
Uncorrelated:
mean(uncor_df$cor)
## [1] -0.0006140347
Correlated:
mean(final_df$cor)
## [1] -0.4143909
We see that the correlated errors have a nonzero overall mean of about -0.41, while the uncorrelated errors have a near-zero mean of about -0.0006.
Which type of errors has a larger variance or are they comparable?
var(final_df$cor)
## [1] 217.82
var(uncor_df$cor)
## [1] 1.009564
The correlated errors have a variance of about 217.8, while the uncorrelated errors have a variance of about 1.01. This makes sense: the uncorrelated errors are iid standard normal draws, so their variance should be close to 1, whereas the correlated errors form a random walk whose variance grows with the number of steps.
The correlated errors therefore have a much larger variance.
2d
How correlated errors affect the ordinary least squared regression. Please simulate data from the usual linear model Yi = β0 + β1xi + ϵi, but now ϵi are generated from your correlated function as in part (a). Let x1, x2, . . . , xn be evenly spaced values between 0 and 10 in increments of 0.2 (inclusive of bounds). Set β0 = 1, β1 = 2. At each value of xi, we only observe one yi. For each dataset generated, please fit the OLS and store: • the fitted parameters; • the associated mean and SE for the parameters in the output of summary.lm(). Please create 1000 simulated datasets and estimate the mean and SE of the regression coefficients. Do the mean and SE based on the simulated values “overall agree” with the values from summary.lm()? Please explain your answer. (Hint: Please refer to the R Hints.)
generate_regression <- function(beta) {
  # Simulate Y = beta0 + beta1 * x + eps on x = 0, 0.2, ..., 10 using the
  # correlated errors from generate_errors() in part (a), then fit OLS.
  #
  # beta: numeric vector c(intercept, slope).
  # Returns: the fitted lm object.
  stopifnot(is.numeric(beta), length(beta) == 2)
  X <- cbind(1, seq(0, 10, by = 0.2))  # design matrix: intercept column + x
  Y <- X %*% beta + generate_errors(nrow(X))
  # Formula kept exactly as `Y ~ X[,2]` so the coefficient labels that
  # downstream summary()/coef() code sees are unchanged.
  lm(Y ~ X[,2])
}
# Run 1000 simulations; keep the full lm objects (simplify = FALSE).
regressions <- replicate(1000, generate_regression(c(1,2)), simplify = FALSE)
# 2 x 1000 matrix: first row is the intercept, second row the slope.
# vapply (with a declared result shape) replaces sapply, whose return type
# depends on its input.
fitted_parameters <- vapply(regressions, coef, numeric(2))
# NOTE(review): the "Estimate" column of summary.lm() is identical to
# coef(), so computed_mean duplicates fitted_parameters; kept to mirror the
# assignment's wording.
computed_mean <- vapply(regressions, function(obj) summary(obj)$coefficients[,"Estimate"], numeric(2))
computed_se <- vapply(regressions, function(obj) summary(obj)$coefficients[,"Std. Error"], numeric(2))
# Monte-Carlo mean and SE of each coefficient across the 1000 fits
estimated_mean <- apply(fitted_parameters, 1, mean)
estimated_se <- apply(fitted_parameters, 1, sd)
2e
Which of our usual regression conditions are satisfied in our simulation above? Please also state why the conditions are violated:
Linearity is not violated, because this is how the data were generated. As shown in the math above, the errors have conditional expectation 0. Since we used the rnorm() function to generate the data, each error is normally distributed. However, crucially, the errors are not independent: each error is generated from the previous one, as shown in the code above. The errors also do not have constant variance, violating the homoscedasticity assumption — the variance of the random-walk errors grows with the index, as shown above.
Project Question Series 3
Formulate a statistical inference problem that will help your instructor to predict the travel time of future trips.:
Propose a detailed analytical pipeline to solve the statistical inference problem:
Design the experiment to evaluate the effectiveness of your approach:
# Load and clean the flights data
flights <- read.csv("pnwflights14.csv")
# Remove rows with missing values
flights <- na.omit(flights)
# Remove nonsense (negative) entries. Logical indexing replaces the
# original `flights[-which(cond), ]`: if which() matches nothing,
# df[-integer(0), ] silently drops EVERY row.
flights <- flights[flights$dep_time >= 0, ]
flights <- flights[flights$air_time >= 0, ]
flights <- flights[flights$distance >= 0, ]
# Convert the scheduled departure into a date-time object
flights$date <- paste(flights$year, "-", flights$month, "-", flights$day, " ", flights$hour, ":", flights$minute, sep = "")
flights$dep_date <- strptime(flights$date, format="%Y-%m-%d %H:%M")
# arr_time is stored as an HHMM integer (e.g. 1345): zero-pad to 4 digits,
# split into hour and minute, and parse. Invalid clock values (e.g. 2400)
# become NA, as they did with the original as.POSIXct()/substr round-trip.
arr_hhmm <- sprintf("%04.0f", flights$arr_time)
flights$arr_date <- strptime(
  paste0(flights$year, "-", flights$month, "-", flights$day, " ",
         substr(arr_hhmm, 1, 2), ":", substr(arr_hhmm, 3, 4)),
  format = "%Y-%m-%d %H:%M"
)
Formulate a statistical inference problem that will help your instructor to predict the travel time of future trips.:
# Mean arrival delay per carrier, sorted worst-first; n = number of flights
# per carrier (used below to pick the best and worst carriers to compare).
avg_carrier_delays <- flights %>%
group_by(carrier) %>%
summarise(mean_delay= mean(arr_delay), n = n()) %>%
arrange(desc(mean_delay))
avg_carrier_delays
The goal of this problem is to predict the travel time of a future trip. Above I calculated the mean arrival delay between flight carriers. I want to see if there is a true difference in the mean arrival delay between the carrier with the lowest arrival delay and the carrier with the highest arrival delay. This will be useful in determining which carrier is the best to pick to minimize arrival delays.
Propose a detailed analytical pipeline to solve the statistical inference problem:
To evaluate if one carrier is better than another carrier, I will conduct a difference in means test. I will then also create a simple linear regression model, to see if other variables better explain the difference in arrival time than carrier.
Design the experiment to evaluate the effectiveness of your approach:
Pre processing:
# Pre-processing (repeated so this section stands alone).
# NOTE(review): the original read from an absolute, machine-specific path
# ("/Users/nikhil/..."); the relative path below matches the earlier chunk
# and keeps the analysis reproducible.
flights <- read.csv("pnwflights14.csv", header = TRUE)
# Remove NAs
flights <- na.omit(flights)
# Remove nonsense (negative) entries; logical indexing avoids the
# -which() pitfall that drops all rows when nothing matches.
flights <- flights[flights$dep_time >= 0, ]
flights <- flights[flights$air_time >= 0, ]
flights <- flights[flights$distance >= 0, ]
# Convert the scheduled departure into a date-time object
flights$date <- paste(flights$year, "-", flights$month, "-", flights$day, " ", flights$hour, ":", flights$minute, sep = "")
flights$dep_date <- strptime(flights$date, format="%Y-%m-%d %H:%M")
# Parse the HHMM arrival time into a date-time (invalid values become NA)
arr_hhmm <- sprintf("%04.0f", flights$arr_time)
flights$arr_date <- strptime(
  paste0(flights$year, "-", flights$month, "-", flights$day, " ",
         substr(arr_hhmm, 1, 2), ":", substr(arr_hhmm, 3, 4)),
  format = "%Y-%m-%d %H:%M"
)
# Recode categorical variables as factors in one pass.
# There are 3007 tail numbers across 145459 observations, so tailnum likely
# identifies the individual aircraft.
factor_cols <- c("month", "year", "day", "tailnum", "origin", "dest", "flight")
flights[factor_cols] <- lapply(flights[factor_cols], as.factor)
Regression
Now we will do some exploratory analysis, and make a simple model and try to identify useful features.
# Exploratory plots: arrival delay against arrival/departure date
plot(flights$arr_date, flights$arr_delay, main = "Arrival Date vs Arrival Delay")
plot(flights$dep_date, flights$arr_delay, main = "Departure Date vs Arrival Delay")
#These variables are highly correlated
plot(flights$dep_delay, flights$arr_delay)
#try and look for collinearity
# Correlation matrix of the candidate numeric predictors
flights2 <- select(flights, dep_delay, arr_time, arr_delay, air_time, distance)
cor(flights2)
## dep_delay arr_time arr_delay air_time distance
## dep_delay 1.000000000 0.05626588 0.92351496 -0.008110625 -0.003219088
## arr_time 0.056265885 1.00000000 0.06920322 -0.042684748 -0.048964531
## arr_delay 0.923514961 0.06920322 1.00000000 -0.023384820 -0.054510499
## air_time -0.008110625 -0.04268475 -0.02338482 1.000000000 0.985321253
## distance -0.003219088 -0.04896453 -0.05451050 0.985321253 1.000000000
From our correlation matrix, we notice a high degree of correlation between arr_delay and dep_delay, as well as between distance and air_time. We will need to keep this in mind when designing our model, as collinearity violates the assumptions of linear regression.
Now we will remove unnecessary/duplicate features, make a simple OLS model, and then evaluate its predictive accuracy:
# Drop redundant date components (kept in dep_date) before modeling
flights2 <- select(flights, -c(month, day, year, date))
flights2$dep_date <- as.Date(flights2$dep_date)
#Test/Train Split
# NOTE(review): createDataPartition() is stochastic and no seed is set
# before this call, so the split (and the RMSE values below) is not
# reproducible across runs.
trainIndex <- createDataPartition(flights2$arr_delay, p = .8,
list = FALSE,
times = 1)
train <- flights2[trainIndex,]
test <- flights2[-trainIndex,]
#simple regressions
# Baseline: arrival delay explained by air time alone
simple1 <- lm(arr_delay ~ air_time, data = train)
simple1_predict <- predict(simple1, test)
RMSE(simple1_predict, test$arr_delay)
## [1] 30.21482
RMSE = 30.21
# Larger model: adds carrier, route (origin/dest), distance, and date
simple2 <- lm(arr_delay ~ air_time+carrier+origin+distance+dest+dep_date, data = train)
simple2_predict <- predict(simple2, test)
RMSE(simple2_predict, test$arr_delay)
## [1] 28.88639
RMSE = 28.89
# Define training control
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)
# Train the model with 10-fold cross-validation.
# NOTE(review): trained on the FULL flights2, so its RMSE is not directly
# comparable to the single hold-out RMSEs above.
model <- train(arr_delay ~ air_time+carrier+origin+distance+dest+dep_date,data = flights2, method = "lm",trControl = train.control)
#RMSE = 29.88867
print(model)
## Linear Regression
##
## 145459 samples
## 6 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 130913, 130914, 130913, 130914, 130912, 130912, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 29.88867 0.08199325 15.34389
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Refit candidate models on the FULL data set (flights2), not just `train`,
# since the goal here is inference rather than prediction.
simple1 <- lm(arr_delay ~ air_time, data = flights2)
simple2 <- lm(arr_delay ~ air_time+carrier+origin+distance+dest+dep_date, data = flights2)
simple3 <- lm(arr_delay ~ origin+dest+carrier+distance+dep_delay, data = flights2)
# NOTE(review): simple4 and simple5 are fitted but never summarized or used
# below.
simple4 <- lm(arr_delay ~ origin+dest+carrier+distance+dep_delay+dep_date, data = flights2)
simple5 <- lm(arr_delay ~ origin+carrier+distance+dep_delay+dep_date, data = flights2)
summary(simple1)
summary(simple2)
summary(simple3)
# NOTE(review): simple2 was just refit on all of flights2, so predicting on
# `test` evaluates on rows the model has already seen (data leakage); the
# RMSE below should not be read as out-of-sample performance.
simple2_predict <- predict(simple2, test)
RMSE(simple2_predict, test$arr_delay)
## [1] 28.88639
summary(simple2)
##
## Call:
## lm(formula = arr_delay ~ air_time + carrier + origin + distance +
## dest + dep_date, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.01 -12.97 -5.96 2.96 1544.36
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -33.579320 14.843899 -2.262 0.023689 *
## air_time 0.819340 0.009545 85.836 < 2e-16 ***
## carrierAS -3.498470 0.679959 -5.145 2.68e-07 ***
## carrierB6 8.115957 1.060463 7.653 1.98e-14 ***
## carrierDL 1.057128 0.810955 1.304 0.192386
## carrierF9 5.728200 1.051792 5.446 5.16e-08 ***
## carrierHA 6.148580 1.440971 4.267 1.98e-05 ***
## carrierOO -0.589468 0.778483 -0.757 0.448931
## carrierUA -3.493424 0.714200 -4.891 1.00e-06 ***
## carrierUS -4.840288 0.999994 -4.840 1.30e-06 ***
## carrierVX -5.639227 0.945940 -5.962 2.51e-09 ***
## carrierWN 3.907996 0.758773 5.150 2.60e-07 ***
## originSEA 3.880219 0.295462 13.133 < 2e-16 ***
## distance -0.111328 0.003416 -32.591 < 2e-16 ***
## destANC -9.434850 1.580866 -5.968 2.41e-09 ***
## destATL 23.746277 3.541739 6.705 2.03e-11 ***
## destAUS 13.031774 2.827907 4.608 4.06e-06 ***
## destBLI -13.645069 5.320922 -2.564 0.010336 *
## destBNA 10.226522 4.016265 2.546 0.010889 *
## destBOI -16.435050 4.514015 -3.641 0.000272 ***
## destBOS 28.447960 4.543318 6.261 3.83e-10 ***
## destBUR -9.623052 1.693521 -5.682 1.33e-08 ***
## destBWI 24.441247 4.327699 5.648 1.63e-08 ***
## destBZN -21.197221 21.464053 -0.988 0.323366
## destCLE 41.953788 6.052034 6.932 4.17e-12 ***
## destCLT 29.342556 3.972325 7.387 1.51e-13 ***
## destCOS 0.278533 2.286084 0.122 0.903027
## destCVG 13.417821 4.057497 3.307 0.000944 ***
## destDCA 25.249663 4.094159 6.167 6.97e-10 ***
## destDEN -0.120057 1.363113 -0.088 0.929817
## destDFW 15.002323 2.096219 7.157 8.30e-13 ***
## destDTW 17.832479 2.856795 6.242 4.33e-10 ***
## destEUG -19.916251 3.757198 -5.301 1.15e-07 ***
## destEWR 30.043377 4.215103 7.128 1.03e-12 ***
## destFAI -5.977125 1.957491 -3.053 0.002263 **
## destFAT -9.466548 2.293133 -4.128 3.66e-05 ***
## destFLL 30.589430 5.413095 5.651 1.60e-08 ***
## destGEG -14.235325 3.433494 -4.146 3.39e-05 ***
## destHDN 2.770253 6.235607 0.444 0.656853
## destHNL 1.524078 4.995720 0.305 0.760308
## destHOU 15.021542 3.638335 4.129 3.65e-05 ***
## destIAD 30.382142 3.981412 7.631 2.35e-14 ***
## destIAH 17.099249 2.615067 6.539 6.23e-11 ***
## destJAC -9.059583 9.789101 -0.925 0.354720
## destJFK 29.837914 4.303215 6.934 4.12e-12 ***
## destJNU -12.364542 1.820744 -6.791 1.12e-11 ***
## destKOA -1.939786 5.123799 -0.379 0.704998
## destKTN -13.704115 2.281001 -6.008 1.88e-09 ***
## destLAS -10.647381 1.645430 -6.471 9.78e-11 ***
## destLAX -7.734320 1.507715 -5.130 2.90e-07 ***
## destLGB -20.293838 1.695910 -11.966 < 2e-16 ***
## destLIH -2.327093 5.200129 -0.448 0.654510
## destLMT -29.951326 4.227893 -7.084 1.41e-12 ***
## destMCI 7.497107 2.016478 3.718 0.000201 ***
## destMCO 27.138619 4.842572 5.604 2.10e-08 ***
## destMDW 5.275071 2.389440 2.208 0.027270 *
## destMIA 37.486776 5.474645 6.847 7.56e-12 ***
## destMKE 8.281312 2.782996 2.976 0.002924 **
## destMSP 5.214388 1.590726 3.278 0.001046 **
## destMSY 4.222769 3.982483 1.060 0.288994
## destOAK -12.061792 2.134747 -5.650 1.61e-08 ***
## destOGG -0.828197 4.895149 -0.169 0.865650
## destOMA 5.554038 2.309222 2.405 0.016167 *
## destONT -8.405240 1.689515 -4.975 6.54e-07 ***
## destORD 20.026661 2.242249 8.932 < 2e-16 ***
## destPDX -23.056450 3.682913 -6.260 3.85e-10 ***
## destPHL 31.218660 4.206677 7.421 1.17e-13 ***
## destPHX -2.866756 1.320319 -2.171 0.029914 *
## destPSP -9.569943 1.810868 -5.285 1.26e-07 ***
## destRDM -19.035885 3.694111 -5.153 2.57e-07 ***
## destRNO -8.310339 3.067706 -2.709 0.006750 **
## destSAN -7.808575 1.400438 -5.576 2.47e-08 ***
## destSAT 6.567756 2.906068 2.260 0.023822 *
## destSBA -9.048387 2.084459 -4.341 1.42e-05 ***
## destSEA -20.217979 3.494754 -5.785 7.26e-09 ***
## destSFO -8.950743 2.153067 -4.157 3.22e-05 ***
## destSIT -11.474127 3.819219 -3.004 0.002662 **
## destSJC -14.805224 2.080129 -7.117 1.11e-12 ***
## destSLC -10.270665 2.071547 -4.958 7.13e-07 ***
## destSMF -8.079258 2.315200 -3.490 0.000484 ***
## destSNA -13.101940 1.548602 -8.460 < 2e-16 ***
## destSTL 13.722940 2.606096 5.266 1.40e-07 ***
## destTPA 7.090775 5.134051 1.381 0.167243
## destTUS -4.145606 1.807512 -2.294 0.021819 *
## dep_date 0.002728 0.000881 3.096 0.001961 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.16 on 116284 degrees of freedom
## Multiple R-squared: 0.08139, Adjusted R-squared: 0.08073
## F-statistic: 122.7 on 84 and 116284 DF, p-value: < 2.2e-16
# Likelihood-ratio test of the nested models (simple1 is nested in simple2)
lrtest(simple1, simple2)
# Standard residual diagnostic plots for the larger model
plot(simple2)
For this problem, I originally thought of using machine learning methods, as I thought that these would be able to make models with very high predictive accuracy. Interestingly I noticed that the model that I settled on had an RMSE of 30.028, whereas my best machine learning model using a 10 k fold split was only able to achieve an RMSE of 29.8.
This led me to think about the problem in a new way: since the goal of this problem is statistical inference, making a model that fits well is more important than making a model that predicts well. I eventually decided to use OLS as it is easy to interpret, and I used variables that were likely to have an effect on arrival delay. I avoided using both air_time and distance together, as these variables are highly correlated and including both would introduce collinearity.
I chose to fit an OLS regression on arr_delay, with the following variables as predictors: origin,dest,carrier,distance,dep_delay,dep_date. This model achieved an R^2 of 0.86, which is much higher than my simplest model’s R^2 of 0.081. Relative to my simplest model, we can also confirm using the likelihood ratio test that my model fits the data better (p < 0.0001).
From the Cook's distance plot, we can see that there are very few influential values, and from the normal Q-Q plot we can assess the normality assumption. Most of the variables in this model were statistically significant, with the exception of a few carriers and origin/destination airports.
What this model shows us, is that for this dataset, there exists a relationship between the predictors and arrival delay, and approximately 87% of the variability in the data can be explained by the model.
Difference in Means Test
# Compare the carriers with the highest (F9) and lowest (US) mean delays.
# %in% replaces the original `carrier == "F9" | carrier == "US"` chain.
df <- subset(flights, carrier %in% c("F9", "US"))
df <- select(df, carrier, arr_delay)
# Arrival-delay distribution for each carrier. xlim = c(0, 200) replaces
# range(0:200), which built a 201-element vector just to take its range.
hist(df$arr_delay[df$carrier == "F9"], main = "Distribution of F9 Carrier Arrival Delay",
     xlim = c(0, 200), xlab = "Arrival Delay (minutes)")
hist(df$arr_delay[df$carrier == "US"], main = "Distribution of US Carrier Arrival Delay",
     xlim = c(0, 200), xlab = "Arrival Delay (minutes)")
The conditions for conducting a difference in means test are that the samples are obtained randomly. I am not completely sure where this data came from but I will assume for now that the samples are obtained randomly.
Since each row represents a different flight, it is fair to assume that samples are independent, and that one flight does not affect another. In any case where it would like a certain accident on the runway causing delays of other flights, this variability will be mitigated by random sampling.
The histograms above show that the delay distributions themselves are not normal. However, since the sample size is well above 40 for both airlines, by the Central Limit Theorem the sampling distributions of the group means are approximately normal, so the normality condition for the test is met.
t.test(arr_delay ~ carrier, data = df)
##
## Welch Two Sample t-test
##
## data: arr_delay by carrier
## t = 11.405, df = 3405.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F9 and group US is not equal to 0
## 95 percent confidence interval:
## 8.730665 12.355621
## sample estimates:
## mean in group F9 mean in group US
## 9.201161 -1.341982
Our t test returned a p value of < 2.2e-16 for the null hypothesis that the true difference in means is equal to zero. Sample sizes were significantly large with 2411 F9 flights and 5319 US flights. This means that we can reject the null hypothesis, and confirm that the true difference in means is not zero. A 95% confidence interval for the true difference in means was [8.730665, 12.355621] and zero is notably not in the confidence interval.
Conclusion
Our regression model provided a coefficient of 5.669208 for having F9 as the carrier and a coefficient of -4.472456 for having US as the carrier, and both coefficients were statistically significant. Our difference-in-means test showed that there is a real difference in the mean arrival delay between F9 and US. If making a decision based solely on airline, the professor should pick US, whose flights are statistically likely to arrive earlier relative to schedule.
However the following histogram shows the distribution of all of the regression coefficients:
hist(coef(simple2), xlab = "coefficient value", main = "Distribution of Regression coefficients")
From observing the distribution, we observe that 5.6 and -4.4 appear to be relatively in the center, meaning that they do not have as big of an effect on arrival time as other variables in the model. While it is likely to be the case that picking US airlines would decrease the flight time, other variables are likely to have a higher chance at decreasing flight time. For example, having Cleavland as a destination has a coefficient of 40, and is much more likely to increase flight time. Additionally, having LMT as a destination has a coefficient of -28 and is much more likely to decrease flight time.